##Isolate Pairs < 1000 SNVs

snv_dists_Penn <- get_snv_dists(dists_Penn, locs_Penn, pt_Penn)
## Warning in check_dists_vs_locs(dists, locs): You have supplied a list of more
## isolates (n = 459 ) with locations than exist in your SNV distance matrix (n =
## 449 ). Will subset
## Warning in check_dists_vs_locs(dists, locs): You have provided an isolate
## location vector of fewer isolates than are contained in your SNV distance matrix
## (dists). Will subset
snv_dists_df_Penn <- as.data.frame(snv_dists_Penn)
colors <- c("Intra-facility pair" = "red", "Inter-facility pair" = "blue")
ggplot() + 
  geom_histogram(data = subset(snv_dists_df_Penn,Pair_Type == "Inter-facility pair" & Pairwise_Dists < 1000), mapping = aes(x = Pairwise_Dists, fill = "Inter-facility pair"), alpha = 0.2) + 
  geom_histogram(data = subset(snv_dists_df_Penn,Pair_Type == "Intra-facility pair" & Pairwise_Dists < 1000), mapping = aes(x = Pairwise_Dists, fill = "Intra-facility pair"), alpha = 0.5) + labs(x = "Pairwise SNV Distance", y = "Count", fill = "Legend") + scale_color_manual(values = colors) + facet_wrap(~ Loc1, ncol = 4) + ggtitle("All Penn Isolate Pairs < 1000 SNV distance by facility")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot() + 
  geom_histogram(data = subset(snv_dists_df_Penn,Pair_Type == "Inter-facility pair" & Pairwise_Dists < 1000), mapping = aes(x = Pairwise_Dists, fill = "Inter-facility pair"), alpha = 0.2) + 
  geom_histogram(data = subset(snv_dists_df_Penn,Pair_Type == "Intra-facility pair" & Pairwise_Dists < 1000), mapping = aes(x = Pairwise_Dists, fill = "Intra-facility pair"), alpha = 0.5) + labs(x = "Pairwise SNV Distance", y = "Count", fill = "Legend") + scale_color_manual(values = colors) + ggtitle("All Penn Isolate Pairs < 1000 SNV distance")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

##Isolate Pairs < 20 SNVs

ggplot() + 
  geom_histogram(data = subset(snv_dists_df_Penn,Pair_Type == "Inter-facility pair" & Pairwise_Dists < 20), mapping = aes(x = Pairwise_Dists, fill = "Inter-facility pair"), alpha = 0.2, binwidth = 1) + 
  geom_histogram(data = subset(snv_dists_df_Penn,Pair_Type == "Intra-facility pair" & Pairwise_Dists < 20), mapping = aes(x = Pairwise_Dists, fill = "Intra-facility pair"), alpha = 0.2, binwidth = 1) + labs(x = "Pairwise SNV Distance", y = "Count", fill = "Legend") + scale_color_manual(values = colors) + ggtitle("All Penn Isolate Pairs < 20 SNV distance")

ggplot() + 
  geom_histogram(data = subset(snv_dists_df_Penn,Pair_Type == "Inter-facility pair" & Pairwise_Dists < 20), mapping = aes(x = Pairwise_Dists, fill = "Inter-facility pair"), alpha = 0.2, binwidth = 1) + 
  geom_histogram(data = subset(snv_dists_df_Penn,Pair_Type == "Intra-facility pair" & Pairwise_Dists < 20), mapping = aes(x = Pairwise_Dists, fill = "Intra-facility pair"), alpha = 0.2, binwidth = 1) + labs(x = "Pairwise SNV Distance", y = "Count", fill = "Legend") + scale_color_manual(values = colors) + ggtitle("All Penn Isolate Pairs < 20 SNV distance by Facility") + facet_wrap(~ Loc1, ncol = 4)

##Isolate Pairs Over time

#add time to the snv_dists_df_Penn thing 
metadata_penn$isolate_char <- paste0("PCMP_H", metadata_penn$isolate_no)
#add the date of isolate #1 to the snv_dists_df_Penn 
snv_dists_df_Penn_dated <- snv_dists_df_Penn %>% left_join(metadata_penn, by = c("Isolate1" = "isolate_char"))
ggplot() + 
  geom_histogram(data = subset(snv_dists_df_Penn_dated,Pair_Type == "Inter-facility pair" & Pairwise_Dists < 20), mapping = aes(x = Pairwise_Dists, fill = "Inter-facility pair"), alpha = 0.2, binwidth = 1) + 
  geom_histogram(data = subset(snv_dists_df_Penn_dated,Pair_Type == "Intra-facility pair" & Pairwise_Dists < 20), mapping = aes(x = Pairwise_Dists, fill = "Intra-facility pair"), alpha = 0.2, binwidth = 1) + labs(x = "Pairwise SNV Distance", y = "Count", fill = "Legend") + scale_color_manual(values = colors) + ggtitle("All Penn Isolate Pairs < 20 SNV distance by Date of first isolate") + facet_wrap(~ cx_date, ncol = 4)

ggplot() + 
  geom_histogram(data = subset(snv_dists_df_Penn_dated,Pair_Type == "Inter-facility pair" & Pairwise_Dists < 1000), mapping = aes(x = Pairwise_Dists, fill = "Inter-facility pair"), alpha = 0.2) + 
  geom_histogram(data = subset(snv_dists_df_Penn_dated,Pair_Type == "Intra-facility pair" & Pairwise_Dists < 1000), mapping = aes(x = Pairwise_Dists, fill = "Intra-facility pair"), alpha = 0.2) + labs(x = "Pairwise SNV Distance", y = "Count", fill = "Legend") + scale_color_manual(values = colors) + ggtitle("All Penn Isolate Pairs < 1000 SNV distance by Date of first isolate") + facet_wrap(~ cx_date, ncol = 4)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot() + 
  geom_histogram(data = subset(snv_dists_df_Penn_dated,Pair_Type == "Inter-facility pair" & Pairwise_Dists < 1000), mapping = aes(x = Pairwise_Dists, fill = "Inter-facility pair"), alpha = 0.2) + 
  geom_histogram(data = subset(snv_dists_df_Penn_dated,Pair_Type == "Intra-facility pair" & Pairwise_Dists < 1000), mapping = aes(x = Pairwise_Dists, fill = "Intra-facility pair"), alpha = 0.2) + labs(x = "Pairwise SNV Distance", y = "Count", fill = "Legend") + scale_color_manual(values = colors) + ggtitle("All Penn Isolate Pairs < 1000 SNV distance by Date by Location of first isolate") + facet_wrap(Loc1 ~ cx_date)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot() + 
  geom_histogram(data = subset(snv_dists_df_Penn_dated,Pair_Type == "Inter-facility pair" & Pairwise_Dists < 20), mapping = aes(x = Pairwise_Dists, fill = "Inter-facility pair"), alpha = 0.2, binwidth = 1) + 
  geom_histogram(data = subset(snv_dists_df_Penn_dated,Pair_Type == "Intra-facility pair" & Pairwise_Dists < 20), mapping = aes(x = Pairwise_Dists, fill = "Intra-facility pair"), alpha = 0.2, binwidth = 1) + labs(x = "Pairwise SNV Distance", y = "Count", fill = "Legend") + scale_color_manual(values = colors) + ggtitle("All Penn Isolate Pairs < 20 SNV distance by Date by Location of first isolate") + facet_wrap(Loc1 ~ cx_date)

facility_sum <- which(unlist(table(locs_Penn) > 10))
snv_dists_df_Penn_dated_sub <- snv_dists_df_Penn_dated %>% filter(Loc1 %in% names(facility_sum))

ggplot() + 
  geom_histogram(data = subset(snv_dists_df_Penn_dated_sub,Pair_Type == "Inter-facility pair" & Pairwise_Dists < 20), mapping = aes(x = Pairwise_Dists, fill = "Inter-facility pair"), alpha = 0.2, binwidth = 1) + 
  geom_histogram(data = subset(snv_dists_df_Penn_dated_sub,Pair_Type == "Intra-facility pair" & Pairwise_Dists < 20), mapping = aes(x = Pairwise_Dists, fill = "Intra-facility pair"), alpha = 0.2, binwidth = 1) + labs(x = "Pairwise SNV Distance", y = "Count", fill = "Legend") + scale_color_manual(values = colors) + ggtitle("All Penn Isolate Pairs < 20 SNV distance by Date by Location of first isolate") + facet_grid(Loc1 ~ cx_date)

#plot how many isolates are present in each facil in each month 
n_isolates <- snv_dists_df_Penn_dated %>% group_by(cx_date, Loc1) %>% summarize(n = n())
## `summarise()` regrouping output by 'cx_date' (override with `.groups` argument)
ggplot(data = n_isolates) + geom_bar(mapping = aes(x = cx_date, y = n, fill = Loc1), stat = "identity", position = "dodge") + labs(x = "Date (month)", y = "Number of isolates from facility", title = "Distribution of number of isolates by facility over time ")

get_clusters

#run get clusters
#subset locs_Penn to have the same as tr_Penn
common <- intersect(tr_Penn$tip.label, names(locs_Penn))
locs_Penn_tr_sub <- locs_Penn[common]
clusters <- get_clusters(tr_Penn,locs_Penn_tr_sub)
## Warning in check_tr_vs_locs(tr, locs): You have provided an isolate location
## vector of fewer isolates than are contained in your SNV distance matrix (dists).
## Will subset
#dissect output
pure_subtree_info <- clusters$pure_subtree_info
subtrees <- clusters$subtrees

ggplot(data = pure_subtree_info, aes(x = f_id, y = subtr_size)) + geom_jitter(position = position_jitter(width = 0.2, height = 0.1), alpha = 0.2) + labs(y = "Number of Isolates in \nPhylogenetic Cluster", x = "Facility ID", title = "Pure Subtree Information")

Plot the tree with location annotation, ST 258 only

#remove the non-st258 tips 
#read in kleborate results
locs <- as.data.frame(locs_Penn_tr_sub)
locs$ID <- rownames(locs)
kleb <- read.csv("/Users/sophiehoffman/Desktop/gl_mount/Project_Penn_KPC/Sequence_data/Reports/kleborate/penn_kleborate_results.txt", sep = "\t")
penn_locs_sub <- locs %>% left_join(kleb, by = c("ID" = "strain")) %>% filter(ST == "ST258") %>% select(ID, locs_Penn_tr_sub)
colnames(penn_locs_sub) <- c("ID", "f_id")

#subset the tree
tr_sub <- keep.tip(tr_Penn,penn_locs_sub$ID)

tr_Penn_mpr = midpoint.root(tr_sub)
gheatmap(ggtree(tr_Penn_mpr, layout = "circular"),as.data.frame(locs_Penn_tr_sub), color = NA)